Crime analysis has become one of the most important topics in data science, driven by the availability of crime datasets and by the need to analyze the causes of criminal activity and to predict future crimes from past criminal records. Many crime datasets are available online nowadays, such as the City of Atlanta Crime 2009-2017 dataset, the City of Baltimore Crime 2011-2016 dataset, and many other datasets that can be found here.
But in this project we are going to work on the Crimes in Chicago Dataset.
The objective of this activity is to answer some questions regarding criminology. Answering these questions and reaching a well-established analysis will also give us a chance to apply the same approaches to crime data from other cities. The aim is to explore the dataset and find key points that can help in detecting the major areas of crime, the factors behind these crimes, and the safe spots around the city. In addition, we explore and answer further questions regarding the performance indicators of the city's police and the correlation between socioeconomic factors and crime in the city.
The questions we have answered:
We found that Theft, Battery and Criminal Damage are the top crimes in Chicago.
# Ten most frequent primary crime types. value_counts() returns the counts in
# descending order, so the first ten entries are the ten largest.
top_10_types = crimes.primary_type.value_counts()[:10]
x = top_10_types.index  # Crime-type labels, also used as the bar order
fig, ax = plt.subplots(figsize=figsize)  # Create figure and axes with the shared figure size
dark_reds = get_dark_colors(101, 6, 'Reds', 10)
dark_reds.reverse()  # Reverse the palette order before it is assigned to the bars
# Horizontal bar plot: counts along the x axis, crime types along the y axis
sb_barplot = sb.barplot(ax=ax, x=top_10_types, y=x, order=x,
                        palette=dark_reds, edgecolor=None, linewidth=0)
sb.despine(left=True, bottom=True)  # Remove plot borders
# Label every bar with "<crime type> (<count>)". The anchor sits 40,000 data
# units left of the bar end and the text is then shifted 40 points to the right.
for crime, p in zip(x.values, sb_barplot.patches):
    sb_barplot.annotate(text=f'{crime} ({int(p.get_width()):,d})',
                        xy=(p.get_width() - 40000, p.get_y() + p.get_height() * 0.57),
                        ha='left',
                        va='center',
                        xytext=(40, 0),
                        textcoords='offset points',
                        color='black',
                        size=15)
# Remove x and y ticks (the bar annotations carry the labels and counts)
plt.xticks([])
plt.yticks([])
# Remove x and y labels
plt.xlabel('')
plt.ylabel('')
plt.title('Top 10 Crime Types in Chicago by Count', y=title_y, fontsize=title_size, color='black')  # Add descriptive title
plt.tight_layout()  # Improve appearance
plt.savefig('src/top_10_types.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file
# Custom colormap derived from the 'Reds' helper palette.
# NOTE(review): this colormap is not passed to WordCloud below, which uses the
# named 'rocket' colormap instead -- confirm which one is intended.
dard_reds = LinearSegmentedColormap.from_list('mycmap', get_dark_colors(100, 7, 'Reds', 12))

# Word cloud sized from the per-crime-type frequencies in crimes_counts
wordcloud_options = {
    'width': 3000,
    'height': 2000,
    'background_color': 'white',
    'random_state': 1,
    'collocations': False,
    'min_font_size': 50,
    'colormap': 'rocket',
}
wordcloud2 = WordCloud(**wordcloud_options).generate_from_frequencies(crimes_counts)

plt.figure(figsize=figsize)
plt.imshow(wordcloud2)  # Render the generated cloud as an image
plt.axis('off')  # Hide the axes around the image
plt.title('WordCloud by All the Crime Types', y=1.16, fontsize=40, color='#c60000')  # Set title
plt.tight_layout(pad=0)
plt.savefig('src/word_cloud.png', bbox_inches='tight', facecolor='white', pad_inches=1)  # Save as png file
Theft crimes are most common on streets and in parking garages, Narcotics crimes are most common on sidewalks and in alleys, while Battery crimes are most common in schools, public buildings, residences and apartments.
def set_4_plots(start_edge, end_edge, save_num):
    """Draw a 2x2 grid of per-location bar plots and save it as a PNG.

    Each panel shows the crime counts by primary type for one location taken
    from ``danger_plcaes.sort_values()[start_edge:end_edge]``. The grid has
    four panels, so at most the first four locations of the slice are drawn.

    Args:
        start_edge: Start of the slice applied to the sorted locations.
        end_edge: End (exclusive) of the slice applied to the sorted locations.
        save_num: Suffix of the output file ``src/top_danger_plcaes{save_num}.png``.
    """
    # Create figure with 2x2 subplots
    fig, axes_sub = plt.subplots(2, 2, figsize=[20, 9])
    fig.subplots_adjust(hspace=0.6, wspace=0.05)
    # Flatten axes array so the panels can be walked in order
    axes = axes_sub.flatten()
    # Add common title
    fig.suptitle('The top 4 Crimes for The Most Dangerous Locations in Chicago',
                 fontsize=title_size, y=title_y + 0.05, color='black', x=0.58)
    color_dic = get_my_colors('Set2', 10, top_primary_types)
    places = danger_plcaes.sort_values()[start_edge:end_edge]
    # zip() stops at the shorter input, so a slice holding more locations than
    # there are panels can no longer index past the last axis.
    for ax, place in zip(axes, places):
        # Subset the data to the current location
        place_df = top_danger_plcaes[top_danger_plcaes['location_description'] == place]
        # Restrict the shared colour mapping to the crime types present here
        color_palette = {crime_type: color_dic[crime_type]
                         for crime_type in place_df.primary_type.values}
        # Add barplot
        barplot = sb.barplot(ax=ax, x=list(place_df.primary_type), y=place_df['count'],
                             palette=color_palette, edgecolor=None, linewidth=0)
        # Add title for the current plot
        ax.set_title(place, fontsize=20, color='black', y=1.11)
        # Remove labels from axes
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.set_yticklabels('')
        ax.set_xticklabels('')
        ax.set_facecolor('xkcd:white')  # Setting face color to white
        sb.despine(left=True, bottom=True)  # Remove plot borders
        for p in barplot.patches:  # Annotate each bar with its count
            barplot.annotate(text=format(int(p.get_height()), ',d'),
                             xy=(p.get_x() + p.get_width() / 2., p.get_height()),
                             ha='center', va='center',
                             xytext=(0, 9),
                             textcoords='offset points',
                             color='black',
                             size=15)
    add_custome_legend(color_dic=color_dic, fontsize=23, x=1.12, y=0.768, fig=fig, handle='patch')
    # plt.tight_layout() is not used here; panel spacing is set by subplots_adjust above.
    plt.savefig(f'src/top_danger_plcaes{save_num}.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file


set_4_plots(0, 4, 1)
# The 4:9 slice can hold up to five locations; only the first four are drawn.
set_4_plots(4, 9, 2)
Battery crimes are the most common crimes related to Domestic Violence.
# Six most frequent primary types among domestic incidents, re-sorted so the
# smallest count comes first.
domestic_only = crimes[crimes['domestic']].reset_index(drop=True)
top_domestic_crimes = domestic_only.primary_type.value_counts()[:6].sort_values()
x = list(top_domestic_crimes.index.values)  # Crime-type labels, also used as the bar order

fig, ax = plt.subplots(figsize=figsize)  # Create figure and axes with the shared figure size
dark_blues = get_dark_colors(50, 4, 'Blues', 6)
# Vertical bar plot: crime types along the x axis, counts along the y axis
sb_barplot = sb.barplot(
    ax=ax,
    x=x,
    y=top_domestic_crimes,
    order=x,
    palette=dark_blues,
    edgecolor=None,
    linewidth=0,
)
sb.despine(left=True, bottom=True)  # Remove plot borders

# Write the count above each bar, centred horizontally and lifted 9 points
for bar in sb_barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    sb_barplot.annotate(
        text=format(int(bar_height), ',d'),
        xy=(bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
        color='black',
        size=15,
    )

plt.xticks(fontsize=15, color='black')  # Increase x ticks labels font size
# Remove y ticks and both axis labels
plt.yticks([])
plt.xlabel('')
plt.ylabel('')
plt.title('Top Domestic Crimes by Count', y=title_y, fontsize=title_size)  # Add descriptive title
plt.tight_layout()  # Improve appearance
plt.savefig('src/top_domestic_crimes.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file
We found a decrease in police performance from 2014 to 2016 despite a decrease in the total number of crimes in those years.
# Twin-axis line chart: success_ratio on the left axis, tot_crimes_year on the right.
fig, ax = plt.subplots(1, 1, figsize=figsize)  # Create figure and axes with the shared figure size
ax_color = '#990011FF'   # Line colour for the left axis
ax2_color = '#1D65A6'    # Line colour for the right axis

# Add common title
fig.suptitle('Police Performance VS. Total Crimes Over Years',
             fontsize=title_size, y=title_y, color='black')

# Left axis line
sb.lineplot(ax=ax, x=list(success_ratio.index), y=success_ratio, ci=0, color=ax_color)

# Take the tick positions chosen for the first line and adjust them by hand.
x_ticks = ax.get_xticks()
x_ticks[0] = 2000.5  # NOTE(review): presumably moves the first tick inside the plotted years -- confirm
x_ticks_labels = [str(int(tick)) for tick in x_ticks]
# Blank out the first and last tick labels
x_ticks_labels[0] = ''
x_ticks_labels[-1] = ''
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticks_labels, fontsize=15, color='black')
set_y_ticks(ax=ax, up_edge=30, step=10, str_suff='%', color='black')

# Right axis line, sharing the same x axis
ax2 = ax.twinx()
sb.lineplot(ax=ax2, x=list(tot_crimes_year.index), y=tot_crimes_year, ci=0, color=ax2_color)
set_y_ticks(ax=ax2, up_edge=200000, step=100000, str_suff='k', color='black')

# Axis labels
ax.set_xlabel('')
ax.set_ylabel('Arrest Ratio', fontsize=17, color='black', labelpad=20)
ax2.set_ylabel('Total Crimes', fontsize=17, color='black', labelpad=20)
ax.set_facecolor('xkcd:white')  # Setting face color to white

# Legend entries mapped to the two line colours
legend_colors = {
    'Successful Arrest Ratio': ax_color,
    'Total Crimes': ax2_color,
}
add_custome_legend(color_dic=legend_colors, fontsize=15, x=0.6, y=1, fig=fig, handle='line')
plt.tight_layout()  # Improve appearance
plt.savefig('src/success_aresst_ratio.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file
July and August (summer months) are the most common months for crime while February and March are the least. On the other hand, Saturday and Sunday (weekends) are the least common days for crimes.
def set_bar_plot(data, x, color):
    """Draw a count plot of ``data`` grouped by column ``x`` and save it as a PNG.

    The title is 'Money-Driven Crimes By <x title-cased>' and the figure is
    written to ``src/money_crimes_{x}.png``.

    Args:
        data: Table passed to ``sb.countplot`` as ``data``.
        x: Name of the column to count by; also used in the title and file name.
        color: Single colour used for every bar.
    """
    fig, ax = plt.subplots(figsize=figsize)  # Create figure and axes with the shared figure size
    # Draw on the axes created above (they are also the current pyplot axes)
    sb_barplot = sb.countplot(ax=ax, x=x, data=data, color=color, edgecolor=None, linewidth=0)
    sb.despine(left=True, bottom=True)  # Remove plot borders
    for p in sb_barplot.patches:  # Annotate each bar with its count
        sb_barplot.annotate(text=format(int(p.get_height()), ',d'),
                            xy=(p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='center',
                            xytext=(0, 9),
                            textcoords='offset points',
                            color='black',
                            size=15)
    plt.xticks(fontsize=15, color='black')  # Increase x ticks labels font size
    # Remove y ticks and both axis labels
    plt.yticks([])
    plt.xlabel('')
    plt.ylabel('')
    plt.title(f'Money-Driven Crimes By {x.title()}', y=title_y + .2, fontsize=title_size)  # Add descriptive title
    plt.tight_layout()  # Improve appearance
    # Text-only legend (zero-length handle) listing the crime types included
    plt.legend(labels=['Theft, Burglary, Robbery and Motor Vehicle Theft Crimes'],
               bbox_to_anchor=(0.76, title_y + .2), fontsize=20, handlelength=0, handletextpad=0)
    plt.savefig(f'src/money_crimes_{x}.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file


set_bar_plot(data=money_crimes, x='month', color='#4c3277')
set_bar_plot(data=money_crimes, x='weekday', color='#9e1030ff')
Crime in Chicago is concentrated in the east, while it is moderate in the south and lower in the north. There are at least 1,000 crimes for each location, with only 1 or 2 locations exceeding 10,000 crimes.
# Map of locations coloured by severity level, with a hand-built colorbar.
severities = gdf['severity'].unique()  # Computed once and reused in the loop below
num_of_degrees = len(severities)
color_pal = ['#F60909', '#CD1CD3', '#0F7CC8', '#064085', '#7C5F0B', '#000000']  # Custom colors
mycmap = LinearSegmentedColormap.from_list('mycmap', color_pal)  # Custom colormap, built before the list is reversed
color_pal.reverse()  # The per-severity marker colors below use the reversed order
ax = chicago.plot(color='white', edgecolor='black', figsize=[6, 6.9])
# Assign a color to each severity level that is present in the data
for n, c in zip(range(1, num_of_degrees + 2), color_pal):
    if n in severities:
        gdf_current = gdf.query(f'severity == {n}')
        gdf_current.plot(ax=ax, color=c, marker='x', markersize=50)
# Remove x and y ticks
plt.xticks(ticks=[])
plt.yticks(ticks=[])
plt.title('The Most Dangerous Locations in Chicago between 2001 and 2017', y=title_y, fontsize=20)  # Set title
sb.despine(ax=ax, left=True, bottom=True)  # Remove plot borders
# Create custom colorbar in its own axes to the right of the map
vmin, vmax = -0.5, 4.25
fig = ax.get_figure()
cax = fig.add_axes([0.97, 0.16, 0.015, 0.695])
sm = plt.cm.ScalarMappable(cmap=mycmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
# The mappable has no data of its own; give it an empty array through the
# public API. The colour limits come from the explicit Normalize above.
sm.set_array([])
cbar = fig.colorbar(sm, cax=cax)
# Exactly six ticks (0, 0.8, ..., 4.0) so that they match the six labels below
cbar.set_ticks(np.linspace(0, 4, 6))
cbar.set_ticklabels(['2K', '4K', '6K', '8K', '10K', '12K'])
cbar.outline.set_visible(False)  # Remove colorbar border color
plt.savefig('src/top_danger_map.png', transparent=True, bbox_inches='tight', pad_inches=1)  # Save as png file
